# General packages
library(tidyverse)
## -- Attaching packages ------------------------------------------------------ tidyverse 1.2.1 --
## v ggplot2 3.1.0 v purrr 0.3.0
## v tibble 2.0.1 v dplyr 0.8.0.1
## v tidyr 0.8.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts --------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(janitor)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(RColorBrewer)
# Packages for cluster analysis:
library(NbClust)
library(cluster)
library(factoextra)
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
library(dendextend)
##
## ---------------------
## Welcome to dendextend version 1.9.0
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
library(ggdendro)
##
## Attaching package: 'ggdendro'
## The following object is masked from 'package:dendextend':
##
## theme_dendro
# Packages for text mining/sentiment analysis/word cloud
library(pdftools)
library(tidytext)
library(wordcloud)
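K-means clustering starts with a chosen number of randomly placed centroids, assigns each observation to its nearest centroid, recalculates each centroid as the mean of its assigned observations, and repeats until the assignments stop changing.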
# Work with the built-in iris data; janitor's clean_names() converts the
# column names to snake_case.
iris_nice <- clean_names(iris)

# Petal length vs. petal width, coloured by species. Without the colour you
# would not know there are 3 clusters rather than 2.
ggplot(data = iris_nice) +
  geom_point(
    mapping = aes(x = petal_length, y = petal_width, colour = species)
  )
How many clusters should exist? What does R think? Estimating the number of clusters is useful when there is no clear grouping variable (such as a species identifier) to tell us in advance.
# Estimate the best number of clusters (between 2 and 10) using only the
# first four columns, i.e. the measurements without the species column.
number_est <- NbClust(
  data = iris_nice[, 1:4],
  min.nc = 2,
  max.nc = 10,
  method = "kmeans"
)
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 10 proposed 2 as the best number of clusters
## * 8 proposed 3 as the best number of clusters
## * 2 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 1 proposed 8 as the best number of clusters
## * 1 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
# Square brackets are base R subsetting: iris_nice[1:4] means "use columns 1 through 4 of iris_nice".
# NbClust says 2 is best, but we are going to use 3 because we know there are three species.
K-means clustering with 3 groups
# k-means picks its starting centroids at random, so fix the RNG seed to make
# the fit (and the cluster numbering used in later plots) repeatable.
set.seed(244)
# Fit k-means with 3 clusters on columns 1-4 (the numeric measurements).
# nstart = 25 runs 25 random starts and keeps the fit with the lowest total
# within-cluster sum of squares, which guards against a poor local optimum.
# NOTE: the printed output recorded in this document came from an unseeded
# run, so the cluster labels (1/2/3) may be permuted relative to it.
iris_km <- kmeans(iris_nice[1:4], centers = 3, nstart = 25)
# Number of observations assigned to each cluster.
iris_km[["size"]]
## [1] 62 38 50
# Centre of each cluster: one row per cluster, one column per variable.
iris_km[["centers"]]
## sepal_length sepal_width petal_length petal_width
## 1 5.901613 2.748387 4.393548 1.433871
## 2 6.850000 3.073684 5.742105 2.071053
## 3 5.006000 3.428000 1.462000 0.246000
# Cluster that each observation was assigned to, in row order.
iris_km[["cluster"]]
## [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [36] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [71] 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2
## [106] 2 1 2 2 2 2 2 2 1 1 2 2 2 2 1 2 1 2 1 2 2 1 1 2 2 2 2 2 1 2 2 2 2 1 2
## [141] 2 2 1 2 2 2 1 2 2 1
# Combine the original data with each observation's cluster assignment,
# stored as a factor in a new column named cluster_no.
iris_cl <- data.frame(
  iris_nice,
  cluster_no = factor(iris_km[["cluster"]])
)
######################################################################################
# Sepal length vs. sepal width, coloured by the assigned k-means cluster, so
# the new clusters are visible.
ggplot(data = iris_cl) +
  geom_point(
    mapping = aes(x = sepal_length, y = sepal_width, colour = cluster_no)
  )
# Petal length vs. petal width: colour shows the k-means cluster and point
# shape shows the species, so we can see how species fit into the clusters.
# The palette is passed by name; the earlier `(palette = "Set2")` form handed
# "Set2" over positionally (so the Set2 palette was never applied) and also
# created a stray global variable called `palette`.
ggplot(iris_cl) +
  geom_point(
    aes(
      x = petal_length,
      y = petal_width,
      color = cluster_no,
      shape = species
    )
  ) +
  scale_color_brewer(palette = "Set2")
# Interactive 3D scatter plot: colour shows the k-means cluster and the marker
# symbol shows the species.
# y previously repeated sepal_length (the same variable as z), which collapsed
# the points onto a plane; it now uses petal_width so the three axes show
# three different measurements, matching the x/y pair of the 2D petal plot.
plot_ly(
  x = iris_cl$petal_length,
  y = iris_cl$petal_width,
  z = iris_cl$sepal_length,
  type = "scatter3d",
  color = iris_cl$cluster_no,
  symbol = iris_cl$species,
  colors = "Set1"
)
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
# plotly makes a fancy, interactive 3D plot